library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(Synth)
## Warning: package 'Synth' was built under R version 4.3.3
## ##
## ## Synth Package: Implements Synthetic Control Methods.
## ## See https://web.stanford.edu/~jhain/synthpage.html for additional information.
library(geosphere)
## Warning: package 'geosphere' was built under R version 4.3.3
library(estimatr)
## Warning: package 'estimatr' was built under R version 4.3.3
library(fixest)
## Warning: package 'fixest' was built under R version 4.3.3
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.3.2
df_combined <- read_csv("df_combined_complete.csv")
## Rows: 121693 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): town_code, month, dia, station_name, address, station_type, nombr...
## dbl (22): provincia, station_code.x, punto_muestreo, year, air_quality, cit...
## date (1): fecha
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_combined <- df_combined |> drop_na(air_quality)
df_combined<- df_combined |>
filter(as.Date(fecha) <= as.Date("2021-06-30"))
#air quality monitoring stations in Madrid are divided into four zones under current regulations
df_combined <- df_combined |>
mutate(zone = case_when(
station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla",
"Ramón y Cajal", "Cuatro Caminos", "Plaza de España",
"Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro",
"Parque del Retiro") ~ "Zone 1",
station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zone 2",
station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada",
"Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
station_name %in% c("Valdemoro", "Arganda del Rey", "Collado Villalba", "Algete", "Colmenar Viejo", "Alcala de Henares") ~ "Control",
TRUE ~ "Excluded"
))
I again confirm that the assumption of parallel trends does not hold, using monthly data.
#MONTHLY
monthly_avg <- df_combined %>%
group_by(year, month, zone) %>%
summarise(mean_air_quality = mean(air_quality, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
monthly_avg <- monthly_avg %>%
mutate(date = as.Date(paste(year, month, "01", sep = "-"), format = "%Y-%m-%d"))
ggplot(monthly_avg, aes(x = date, y = mean_air_quality, color = zone)) +
geom_line() +
geom_vline(xintercept = as.Date("2018-11-01"), linetype = "dashed", color = "red", size = 1) +
labs(title = "Serie Temporal de la Contaminación Media Mensual por Año y Tratamiento",
x = "Fecha",
y = "Contaminación Media (air_quality)",
color = "zone") +
theme_minimal() +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Map to visualize monitoring stations
#MAP
color_pal <- colorFactor(
palette = c("red", "grey", "lightblue", "orange", "purple", "yellow"),
domain = c("Control", "Excluded", "Zone 1", "Zone 2", "Zone 3", "Zone 4")
)
map <- leaflet(df_combined) |>
addProviderTiles(providers$CartoDB.Positron) |>
setView(lng = median(df_combined$longitude), lat = median(df_combined$latitude), zoom = 12) |>
# Añadir círculos con color basado en la variable 'zone'
addCircles(
lng = ~longitude,
lat = ~latitude,
color = ~color_pal(zone),
fillOpacity = 0.7,
radius = 5
) |>
addLabelOnlyMarkers(
lng = median(df_combined$longitude) - 0.3,
lat = median(df_combined$latitude) + 0.1,
labelOptions = labelOptions(textsize = "20px", noHide = TRUE, textOnly = TRUE)
) |>
addLegend(
position = "topright",
pal = color_pal,
values = c("Zone 1", "Zone 2", "Zone 3", "Zone 4", "Excluded", "Control"),
title = "Zone",
opacity = 1
)
# Mostrar el mapa
map
1.1 MADRID MUNICIPALITY
df_combined <- df_combined |>
filter(zone != "Excluded")
monthly_avg <- df_combined |>
group_by(year, month, zone) |>
summarise(mean_air_quality = mean(air_quality, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
df_combined <- df_combined |>
filter(!(fecha >= as.Date("2020-02-01") & fecha <= as.Date("2020-07-31"))) #omitted covid period
df_combined <- df_combined |> filter(as.Date(fecha) <= as.Date("2021-06-30"))
model0 <- feols(air_quality ~ city_center + post_treat_mad_central + city_center*post_treat_mad_central + prec + dir + velmedia + racha +year + zone, data = df_combined, cluster = ~zone)
## The variable 'zoneZone 4' has been removed because of collinearity (see $collin.var).
summary(model0)
## OLS estimation, Dep. Var.: air_quality
## Observations: 43,798
## Standard-errors: Clustered (zone)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7488.794175 344.801972 21.719116 2.6587e-05
## city_center -8.517192 0.560793 -15.187766 1.0958e-04
## post_treat_mad_central 5.384099 0.384141 14.015927 1.5034e-04
## prec -0.071051 0.030686 -2.315440 8.1544e-02
## dir 0.001942 0.007973 0.243523 8.1958e-01
## velmedia -2.595358 0.161292 -16.091069 8.7239e-05
## racha -1.214888 0.202946 -5.986253 3.9153e-03
## year -3.690046 0.170816 -21.602521 2.7162e-05
## zoneZone 1 22.343871 0.237664 94.014371 7.6744e-08
## zoneZone 2 23.281388 0.253276 91.920995 8.3975e-08
## zoneZone 3 18.740243 0.510002 36.745455 3.2749e-06
## city_center:post_treat_mad_central -1.344124 0.706985 -1.901206 1.3006e-01
##
## (Intercept) ***
## city_center ***
## post_treat_mad_central ***
## prec .
## dir
## velmedia ***
## racha **
## year ***
## zoneZone 1 ***
## zoneZone 2 ***
## zoneZone 3 ***
## city_center:post_treat_mad_central
## ... 1 variable was removed because of collinearity (zoneZone 4)
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 16.2 Adj. R2: 0.327826
1.2 Assesing heterogeneity treatment effects. Differences in differences approach by zone.
df_combined<- df_combined |>
mutate(year = as.factor(year), zone = as.factor(zone), post = as.factor(post_treat_mad_central))
df_combined <- df_combined |>
mutate(
post = as.numeric(as.character(post)),
year = as.numeric(as.character(year))
)
df_combined <- df_combined |>
mutate(
treatment_zone1 = ifelse(zone == "Zone 1", 1, 0),
treatment_zone2 = ifelse(zone == "Zone 2", 1, 0),
treatment_zone3 = ifelse(zone == "Zone 3", 1, 0),
treatment_zone4 = ifelse(zone == "Zone 4", 1, 0))
df_combined <- df_combined %>%
mutate(
POST_treat_1 = post * treatment_zone1,
POST_treat_2 = post * treatment_zone2,
POST_treat_3 = post * treatment_zone3,
POST_treat_4 = post * treatment_zone4,
)
model1 <- feols(air_quality ~ post + treatment_zone1 + treatment_zone2 + treatment_zone3 + treatment_zone4 + POST_treat_1 + POST_treat_2 + POST_treat_3 + POST_treat_4 + prec + dir + velmedia + racha + year, data = df_combined, cluster = ~zone)
summary(model1)
## OLS estimation, Dep. Var.: air_quality
## Observations: 43,798
## Standard-errors: Clustered (zone)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7494.951212 339.824579 22.055354 2.5013e-05 ***
## post 5.393797 0.377512 14.287741 1.3939e-04 ***
## treatment_zone1 14.469147 0.250641 57.728554 5.3916e-07 ***
## treatment_zone2 14.367790 0.232390 61.826304 4.0992e-07 ***
## treatment_zone3 10.263769 0.150640 68.134516 2.7801e-07 ***
## treatment_zone4 -10.538701 0.439363 -23.986300 1.7918e-05 ***
## POST_treat_1 -2.561131 0.154686 -16.556935 7.7937e-05 ***
## POST_treat_2 -0.571681 0.147279 -3.881627 1.7817e-02 *
## POST_treat_3 -1.417991 0.120702 -11.747869 3.0035e-04 ***
## POST_treat_4 2.548084 0.078853 32.314438 5.4676e-06 ***
## prec -0.070765 0.030658 -2.308220 8.2193e-02 .
## dir 0.002055 0.007973 0.257692 8.0936e-01
## velmedia -2.583066 0.165140 -15.641672 9.7561e-05 ***
## racha -1.217143 0.203216 -5.989392 3.9078e-03 **
## year -3.693107 0.168336 -21.938964 2.5544e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 16.2 Adj. R2: 0.328812
data_post_policy <- read_csv("predictions_xgboost_POSTPOLICY_CITYCENTER_CV") |>
filter(fecha < as.Date("2021-05-01")) |>
filter(!(fecha >= as.Date("2020-02-01") & fecha <= as.Date("2020-07-31")))
## Rows: 44321 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): dia, station_name, station_type
## dbl (12): year, month, air_quality, longitude, latitude, altitud, tmed, pre...
## date (1): fecha
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Ajustar el nombre de las zonas
data_post_policy <- data_post_policy |>
mutate(zone = case_when(
station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla",
"Ramón y Cajal", "Cuatro Caminos", "Plaza de España",
"Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro",
"Parque del Retiro") ~ "Zone 1",
station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zona 2",
station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada",
"Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
TRUE ~ NA_character_
))
DESCRIPTIVE STATISTICS FOR MADRID
mean_no2 <- data_post_policy |>
summarise(
mean_no2= mean(air_quality, na.rm = TRUE),
sd_no2 = sd(air_quality, na.rm = TRUE),
mean_tmed=mean(tmed),
sd_tmed=sd(tmed),
mean_prec=mean(prec),
sd_prec=sd(prec),
mean_prec=mean(prec),
sd_prec=sd(prec),
mean_dir=mean(dir),
sd_dir=sd(dir)
)
print(mean_no2)
## # A tibble: 1 × 8
## mean_no2 sd_no2 mean_tmed sd_tmed mean_prec sd_prec mean_dir sd_dir
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 34.3 20.1 13.6 7.30 1.18 3.99 26.8 25.2
DESCRIPTIVE STATISTICS BY ZONE
mean_no2_zone<- data_post_policy |>
group_by(zone) |>
summarise(
mean_no2= mean(air_quality, na.rm = TRUE),
sd_no2 = sd(air_quality, na.rm = TRUE),
mean_tmed=mean(tmed),
sd_tmed=sd(tmed),
mean_prec=mean(prec),
sd_prec=sd(prec),
mean_prec=mean(prec),
sd_prec=sd(prec),
mean_racha=mean(racha),
sd_racha=sd(racha)
)
print(mean_no2_zone)
## # A tibble: 4 × 9
## zone mean_no2 sd_no2 mean_tmed sd_tmed mean_prec sd_prec mean_racha sd_racha
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Zona 2 38.4 20.9 14.2 7.47 1.2 4.06 9.56 3.96
## 2 Zone 1 36.5 20.1 13.5 7.19 1.20 4.11 9.56 4.02
## 3 Zone 3 31.9 18.6 13.4 7.37 1.12 3.79 9.97 4.33
## 4 Zone 4 18.7 13.1 12.6 7.05 1.11 3.74 8.53 4.01
2.1 AVERAGE TREATMENT ON TREATED (ATT) ESTIMATION
# Calculate the difference between observed and predicted air quality
data_with_difference <- data_post_policy |>
mutate(
difference_air_quality_predicted = air_quality - predicted_air_quality
)
# Calculate the overall mean of the difference <- ATT (AVERAGE TREATMENT ON TREATET)
mean_difference_general <- data_with_difference |>
summarise(
mean_difference = mean(difference_air_quality_predicted, na.rm = TRUE)
)
print(mean_difference_general)
## # A tibble: 1 × 1
## mean_difference
## <dbl>
## 1 -3.25
Calculating standar errors as Souza(2019) proposed
# Ajustar la diferencia restando la media general
data_with_adjusted_difference <- data_with_difference |>
mutate(
adjusted_difference = difference_air_quality_predicted - mean_difference_general$mean_difference
)
# Calcular el sumatorio de las diferencias ajustadas
sum_adjusted_difference_general <- data_with_adjusted_difference |>
summarise(
sum_adjusted_difference = sum(adjusted_difference, na.rm = TRUE)
)
# Leer datos de entrenamiento
train_data_20 <- read_csv("train_data_20.csv") |>
mutate(zone = case_when(
station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla",
"Ramón y Cajal", "Cuatro Caminos", "Plaza de España",
"Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro",
"Parque del Retiro") ~ "Zone 1",
station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zona 2",
station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada",
"Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
TRUE ~ NA_character_
))
## Rows: 5028 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station_name, station_type
## dbl (13): year, month, air_quality, longitude, latitude, altitud, tmed, pre...
## date (1): fecha
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Calcular el sumatorio de residuos de validación cruzada
sumatorio_cv_residuos_general <- train_data_20 |>
summarise(
sumatorio_cv_residuos = sum(cv_residuos, na.rm = TRUE)
)
# Calcular la varianza y el error estándar
standar_error_cv_general <- sum_adjusted_difference_general |>
mutate(
variance_cv = (sum_adjusted_difference + sumatorio_cv_residuos_general$sumatorio_cv_residuos)^2,
se_cv = sqrt(variance_cv)
)
# Mostrar los resultados
print(standar_error_cv_general)
## # A tibble: 1 × 3
## sum_adjusted_difference variance_cv se_cv
## <dbl> <dbl> <dbl>
## 1 1.07e-11 2.60 1.61
print(mean_difference_general)
## # A tibble: 1 × 1
## mean_difference
## <dbl>
## 1 -3.25
2.2 CONDITIONAL AVERAGE TREATMENT ON TREATED (C ATT) ESTIMATION BY ZONE
data_post_policy<- read_csv("predictions_xgboost_POSTPOLICY_CITYCENTER_CV")
## Rows: 44321 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): dia, station_name, station_type
## dbl (12): year, month, air_quality, longitude, latitude, altitud, tmed, pre...
## date (1): fecha
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_post_policy<- data_post_policy |>
filter(fecha < as.Date("2021-05-01"))
data_post_policy <- data_post_policy |>
filter(!(fecha >= as.Date("2020-02-01") & fecha <= as.Date("2020-07-31")))
data_post_policy <- data_post_policy |>
mutate(zone = case_when(
station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla",
"Ramón y Cajal", "Cuatro Caminos", "Plaza de España",
"Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro",
"Parque del Retiro") ~ "Zone 1",
station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zona 2",
station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada",
"Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
TRUE ~ NA_character_
))
data_with_difference <- data_post_policy |>
mutate(
difference_air_quality_predicted = air_quality - predicted_air_quality
)
mean_difference_by_zone <- data_with_difference |>
group_by(zone) |>
summarise(
mean_difference = mean(difference_air_quality_predicted, na.rm = TRUE)
)
print(mean_difference_by_zone) #C ATT BY ZONE
## # A tibble: 4 × 2
## zone mean_difference
## <chr> <dbl>
## 1 Zona 2 -2.28
## 2 Zone 1 -4.39
## 3 Zone 3 -3.35
## 4 Zone 4 -0.130
Calculating Standar Errors by zone as proposed by (Souza, 2019)
train_data_20<- read_csv("train_data_20.csv")
## Rows: 5028 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): station_name, station_type
## dbl (13): year, month, air_quality, longitude, latitude, altitud, tmed, pre...
## date (1): fecha
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
train_data_20 <- train_data_20 %>%
mutate(zone = case_when(
station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla",
"Ramón y Cajal", "Cuatro Caminos", "Plaza de España",
"Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro",
"Parque del Retiro") ~ "Zone 1",
station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zona 2",
station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada",
"Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
TRUE ~ NA_character_
))
data_with_difference <- data_post_policy |>
mutate(
difference_air_quality_predicted = air_quality - predicted_air_quality
)
mean_difference_by_zone <- data_with_difference |>
group_by(zone) |>
summarise(
mean_difference = mean(difference_air_quality_predicted, na.rm = TRUE)
)
# Step 3: Subtract the overall mean from each value in the difference column for each zone category
data_with_adjusted_difference <- data_with_difference |>
left_join(mean_difference_by_zone, by = "zone") |>
mutate(
adjusted_difference = difference_air_quality_predicted - mean_difference
)
# Step 4: Calculate the sum of adjusted differences for each zone category
sum_adjusted_difference_by_zone <- data_with_adjusted_difference |>
group_by(zone) |>
summarise(
sum_adjusted_difference = sum(adjusted_difference, na.rm = TRUE)
)
# Step 5: Calculate the sum of cv_residuos
sumatorio_cv_residuos_by_zone <- train_data_20 |>
group_by(zone) |>
summarise(
sumatorio_cv_residuos = sum(cv_residuos, na.rm = TRUE)
)
standar_error_cv_by_zone <- sum_adjusted_difference_by_zone |>
left_join(sumatorio_cv_residuos_by_zone, by = "zone") |>
mutate(
variance_cv = (sum_adjusted_difference + sumatorio_cv_residuos)^2
)
# Step 2: Calculate the standard error from the variance
standar_error_cv_by_zone <- standar_error_cv_by_zone |>
mutate(
se_cv = sqrt(variance_cv)
)
# Display the results
print(standar_error_cv_by_zone)
## # A tibble: 4 × 5
## zone sum_adjusted_difference sumatorio_cv_residuos variance_cv se_cv
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Zona 2 -1.82e-12 -0.466 0.217 0.466
## 2 Zone 1 -1.59e-12 -0.179 0.0322 0.179
## 3 Zone 3 1.94e-12 1.85 3.41 1.85
## 4 Zone 4 -1.48e-13 0.411 0.169 0.411
print(mean_difference_by_zone)
## # A tibble: 4 × 2
## zone mean_difference
## <chr> <dbl>
## 1 Zona 2 -2.28
## 2 Zone 1 -4.39
## 3 Zone 3 -3.35
## 4 Zone 4 -0.130
observations_per_zone <- data_post_policy %>%
count(zone)
print(observations_per_zone)
## # A tibble: 4 × 2
## zone n
## <chr> <int>
## 1 Zona 2 4206
## 2 Zone 1 7010
## 3 Zone 3 4206
## 4 Zone 4 1402
I repeat the analysis only with the post-COVID-19 data to perform a robustness analysis. This is necessary because the pandemic may have significantly altered the dynamics of key variables, such as air quality and traffic patterns. By focusing the analysis on the post-COVID period, we verify whether the original conclusions remain valid in a different context, ensuring the consistency and reliability of the results obtained.
df_combined <- read_csv("df_combined_complete.csv")
## Rows: 121693 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): town_code, month, dia, station_name, address, station_type, nombr...
## dbl (22): provincia, station_code.x, punto_muestreo, year, air_quality, cit...
## date (1): fecha
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_combined <- df_combined |> drop_na(air_quality)
df_combined <- df_combined |>
filter(fecha <= as.Date("2020-02-01"))
df_combined <- df_combined %>%
mutate(zone = case_when(
station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla",
"Ramón y Cajal", "Cuatro Caminos", "Plaza de España",
"Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro",
"Parque del Retiro") ~ "Zone 1",
station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zone 2",
station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada",
"Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
station_name %in% c("Valdemoro", "Arganda del Rey", "Collado Villalba", "Algete", "Colmenar Viejo", "Alcala de Henares", "San Martin de Valdeiglesias", "Villa del Prado") ~ "Control",
TRUE ~ "Excluded"
))
3.1 MADRID MUNICIPALITY BEFORE COVID-19
df_combined <- df_combined %>%
filter(zone != "Excluded")
model0 <- feols(air_quality ~ city_center + post_treat_mad_central + city_center*post_treat_mad_central + prec + dir + velmedia + racha +year + zone, data = df_combined, cluster = ~zone)
## The variable 'zoneZone 4' has been removed because of collinearity (see $collin.var).
summary(model0)
## OLS estimation, Dep. Var.: air_quality
## Observations: 36,063
## Standard-errors: Clustered (zone)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8704.547787 325.824846 26.715421 1.1670e-05
## city_center -6.505688 0.735674 -8.843169 9.0276e-04
## post_treat_mad_central 6.461463 0.213520 30.261679 7.1027e-06
## prec 0.023889 0.056481 0.422964 6.9407e-01
## dir 0.018242 0.009109 2.002678 1.1576e-01
## velmedia -2.598495 0.217360 -11.954781 2.8054e-04
## racha -1.409307 0.370445 -3.804367 1.9032e-02
## year -4.293628 0.162444 -26.431471 1.2177e-05
## zoneZone 1 23.782929 0.363100 65.499647 3.2548e-07
## zoneZone 2 24.647735 0.389773 63.236053 3.7460e-07
## zoneZone 3 20.264191 0.798603 25.374548 1.4324e-05
## city_center:post_treat_mad_central -1.564799 0.780581 -2.004660 1.1550e-01
##
## (Intercept) ***
## city_center ***
## post_treat_mad_central ***
## prec
## dir
## velmedia ***
## racha *
## year ***
## zoneZone 1 ***
## zoneZone 2 ***
## zoneZone 3 ***
## city_center:post_treat_mad_central
## ... 1 variable was removed because of collinearity (zoneZone 4)
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 16.3 Adj. R2: 0.377892
3.2 Assesing heterogeneity treatment effects. Differences in differences approach by zone before COVID-19
df_combined<- df_combined |>
mutate(year = as.factor(year), zone = as.factor(zone), post = as.factor(post_treat_mad_central))
df_combined <- df_combined |>
mutate(
post = as.numeric(as.character(post)),
year = as.numeric(as.character(year))
)
# Crear variables de tratamiento para cada zona
df_combined <- df_combined |>
mutate(
treatment_zone1 = ifelse(zone == "Zone 1", 1, 0),
treatment_zone2 = ifelse(zone == "Zone 2", 1, 0),
treatment_zone3 = ifelse(zone == "Zone 3", 1, 0),
treatment_zone4 = ifelse(zone == "Zone 4", 1, 0))
df_combined <- df_combined %>%
mutate(
POST_treat_1 = post * treatment_zone1,
POST_treat_2 = post * treatment_zone2,
POST_treat_3 = post * treatment_zone3,
POST_treat_4 = post * treatment_zone4,
)
model1 <- feols(air_quality ~ post + treatment_zone1 + treatment_zone2 + treatment_zone3 + treatment_zone4 + POST_treat_1 + POST_treat_2 + POST_treat_3 + POST_treat_4 + prec + dir + velmedia + racha + year, data = df_combined, cluster = ~zone)
summary(model1)
## OLS estimation, Dep. Var.: air_quality
## Observations: 36,063
## Standard-errors: Clustered (zone)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8706.282701 324.529304 26.827416 1.1477e-05 ***
## post 6.462936 0.212669 30.389699 6.9842e-06 ***
## treatment_zone1 17.775248 0.385205 46.144952 1.3192e-06 ***
## treatment_zone2 17.694147 0.354669 49.889121 9.6597e-07 ***
## treatment_zone3 13.687691 0.168527 81.219434 1.3774e-07 ***
## treatment_zone4 -7.416719 0.711130 -10.429488 4.7746e-04 ***
## POST_treat_1 -2.868307 0.158398 -18.108271 5.4685e-05 ***
## POST_treat_2 -0.384260 0.164323 -2.338442 7.9514e-02 .
## POST_treat_3 -1.373467 0.336740 -4.078714 1.5114e-02 *
## POST_treat_4 0.835244 0.186240 4.484773 1.0950e-02 *
## prec 0.024003 0.056550 0.424460 6.9306e-01
## dir 0.018071 0.009051 1.996602 1.1657e-01
## velmedia -2.596305 0.217924 -11.913809 2.8433e-04 ***
## racha -1.409408 0.370681 -3.802210 1.9067e-02 *
## year -4.294488 0.161803 -26.541462 1.1977e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 16.2 Adj. R2: 0.378484
data_post_policy<- read_csv("predictions_xgboost_POSTPOLICY_CITYCENTER_CV")
## Rows: 44321 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): dia, station_name, station_type
## dbl (12): year, month, air_quality, longitude, latitude, altitud, tmed, pre...
## date (1): fecha
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_post_policy <- data_post_policy |>
filter(fecha <= as.Date("2020-02-01"))
data_post_policy <- data_post_policy |>
mutate(zone = case_when(
station_name %in% c("Escuelas Aguirre", "Castellana", "Plaza Castilla",
"Ramón y Cajal", "Cuatro Caminos", "Plaza de España",
"Barrio del Pilar", "Plaza del Carmen", "Méndez Álvaro",
"Parque del Retiro") ~ "Zone 1",
station_name %in% c("Plaza Elíptica", "Farolillo", "Villaverde", "Moratalaz", "Vallecas", "Ensanche de Vallecas") ~ "Zona 2",
station_name %in% c("Arturo Soria", "Sanchinarro", "Urb. Embajada",
"Barajas Pueblo", "Tres Olivos", "Juan Carlos I") ~ "Zone 3",
station_name %in% c("El Pardo", "Casa de Campo") ~ "Zone 4",
TRUE ~ NA_character_
))
# Agrupar por zona, año y mes, y calcular medias mensuales
data_post_policy_monthly <- data_post_policy |>
group_by(year, month, zone) |>
summarise(
mean_air_quality = mean(air_quality, na.rm = TRUE),
mean_predicted_air_quality = mean(predicted_air_quality, na.rm = TRUE),
.groups = 'drop'
)
# Calcular la diferencia mensual
data_with_difference_monthly <- data_post_policy_monthly |>
mutate(
difference_air_quality_predicted = mean_air_quality - mean_predicted_air_quality
)
4.1 AVERAGE TREATMENT ON TREATED BEFORE COVID-19
mean_difference_general <- data_with_difference_monthly |>
summarise(
mean_difference = mean(difference_air_quality_predicted, na.rm = TRUE)
)
print(mean_difference_general)
## # A tibble: 1 × 1
## mean_difference
## <dbl>
## 1 -3.17
4.2 CONDITIONAL AVERAGE TREATMENT ON TREATED BY ZONE BEFORE COVID-19
mean_difference_by_zone <- data_with_difference_monthly |>
group_by(zone) |>
summarise(
mean_difference = mean(difference_air_quality_predicted, na.rm = TRUE),
.groups = 'drop'
)
print(mean_difference_by_zone)
## # A tibble: 4 × 2
## zone mean_difference
## <chr> <dbl>
## 1 Zona 2 -3.00
## 2 Zone 1 -5.52
## 3 Zone 3 -3.33
## 4 Zone 4 -0.824
observations_per_zone <- data_post_policy %>%
count(zone)
# Mostrar los resultados
print(observations_per_zone)
## # A tibble: 4 × 2
## zone n
## <chr> <int>
## 1 Zona 2 2574
## 2 Zone 1 4290
## 3 Zone 3 2574
## 4 Zone 4 858